# Combine each store quantile (100) x module x year into store quantile x group, weeks across years rounded
# JHL

# Packages 
list.of.packages <- c("folderfun", "data.table", "bit64", "lubridate", "foreign", "dplyr", "ggplot2")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages,repos = "http://cran.us.r-project.org")

require(folderfun)
require(data.table)
require(dplyr)
require(foreign)
require(ggplot2)
require(lubridate)
require(bit64) # Read long integers
setNumericRounding(0)

# Setup paths
setff("path_home")
path_home <- ffpath_home()

dta_path <- sprintf("%s/raw/nielsen",path_home)
save_dir <- paste0(path_home,"/dta/nielsen/sqg_posted_group/%s_%s.rds")
do_path <- sprintf("%s/do/data_prep/01_price_index",path_home)
repository <- sprintf("%s/dta/nielsen/sqmy_posted_module",path_home)

# Open necessary files: module and store info
mod_gp <- data.table(read.dta(sprintf("%s/Product_Hierarchy_1.22.2016.dta",dta_path)))
mod_gp <- mod_gp[,.(product_module_code,product_group_code,department_code)]
stores_0615 <- data.table(read.dta(sprintf("%s/stores_0615.dta",dta_path)))
stores_0615 <- stores_0615[,.(store_code_uc,year,fips_state_code,fips_county_code)]

# Create quantiles by store 
store_codes <- data.table(unique(stores_0615[,store_code_uc]))
setnames(store_codes,"V1","store_code_uc")
store_codes$quantile <- ntile(store_codes$store_code_uc,100)
setkey(store_codes,store_code_uc)
setkey(stores_0615,store_code_uc)
stores_0615 <- store_codes[stores_0615, nomatch=0]

# File table for all tsv files in store quantile x module posted prices
if(!file.exists(sprintf("%s/sqmy_posted_module_0615_files.RData",do_path))){

		setwd(repository)
		
		file.table <- system("ls -S", intern = TRUE)
		
		# Extract name of files, match with module and group codes
		file.table <- data.table(file.table)
		setnames(file.table,"file.table","file_name") 
		file.table[,c("V1","V2","V3"):=tstrsplit(file_name,"_",fixed=T)]
		file.table[,c("V4","V5"):=tstrsplit(V3,".",fixed=T)]
		
		setnames(file.table,"V4","year")
		setnames(file.table,"V2","product_module_code")
		setnames(file.table,"V1","store_quantile")
		file.table[,':='(file_name=paste(repository,"/",file_name,sep=""),V3=NULL,V5=NULL,product_module_code=as.numeric(product_module_code))]
		set.seed(10)
		file.table[,':='(size_id=.I,id=sample(1:nrow(file.table), nrow(file.table), replace=F))]

		setkey(mod_gp, product_module_code)
		setkey(file.table,product_module_code)
		
		file.table <- mod_gp[file.table,nomatch=0]		
		file.table <- file.table[!(product_module_code==8621&product_group_code==5507&year>=2014)]
		file.table <- file.table[!(product_module_code==8621&product_group_code==5516&year<2014)]
		
		file.table[,':='(size_id=sum(size_id),id=sum(id)),by=.(store_quantile,product_group_code)] # Sort by group 
				
		save(file.table, file = sprintf("%s/sqmy_posted_module_0615_files.RData",do_path)) # Save to RData for quick loading 				 
}

load(sprintf("%s/sqmy_posted_module_0615_files.RData",do_path))	

# 1:100
myID <- as.numeric(Sys.getenv("SLURM_ARRAY_TASK_ID"))

setkey(file.table,id)
quant = quantile(unique(file.table$id),seq(0,1,by=0.01))
file.table = file.table[id>=quant[[myID]] & id<=quant[[myID+1]],]

### Bind modules into store quantile x group across years and modules ### 
for(i in unique(file.table[,id])){
	data <- lapply(file.table[id==i,file_name],function(x){data.table(readRDS(x))})
	data <- rbindlist(data)

	data <- data[,decile:=NULL]

	# Recollapse to ensure starting weeks of next year is included, then drop the year end obs
	data[,':='(Price=sum(Units*Price)/sum(Units)),by=.(store_code_uc,upc,upc_ver_uc,year,quarter)]		
	data[,':='(Units=sum(Units)),by=.(store_code_uc,upc,upc_ver_uc,year,quarter)]
	data <- data[year==panel_year]
	# data[,panel_year:=NULL]
	
	# Merge with store quantiles (100)
	setkey(stores_0615,store_code_uc,year)
	setkey(data,store_code_uc,year)
	data <- stores_0615[data,nomatch=0]
	
	group <- unique(file.table[id==i,product_group_code])
	
	lapply(unique(data[,quantile]),function(x){ saveRDS(data[quantile==x],sprintf(save_dir,x,group)) })	
	
	cat(sprintf('Store quantile %s Group %s Done ',unique(file.table[id==i,store_quantile]),group))
	gc()
}